In [20]:
from datetime import datetime
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.io as pio

import geopandas as gpd
import json

from ipywidgets import interactive, HBox, VBox, interact, widgets
from IPython.display import IFrame
In [2]:
# Register a custom "rockwell" template (Rockwell font, size 18, dark-blue text,
# title anchored at its vertical middle) and make it the default for all figures,
# layered on top of the built-in "plotly_white" template.
pio.templates['rockwell'] = go.layout.Template(
    layout=go.Layout(
        font={'family': 'Rockwell', 'size': 18, 'color': '#2A5674'},
        title=dict(yanchor='middle'),
    )
)
pio.templates.default = 'plotly_white+rockwell'

Данные

Подготовим данные для визуализации — реальные каунты (число поездок) и прогнозы.

In [3]:
# Hourly timestamps from 2018-06-30 23:00 through 2018-07-31 23:00 (inclusive).
# The first element is the hour *before* July; later cells skip it with july[1:].
# Lower-case 'h' is used because the upper-case 'H' alias is deprecated since pandas 2.2.
july = pd.date_range(datetime(2018, 6, 30, 23), datetime(2018, 7, 31, 23), freq='h')
In [4]:
# Observed hourly pickup counts for July, one column per taxi zone.
# july[0] (2018-06-30 23:00) is skipped, so the frame starts at 2018-07-01 00:00.
july_real_counts = (
    pd.read_csv('NYC_TAXI_aggregated_data/pu_agg_data_pop.csv',
                index_col=[0], parse_dates=[0])
    .loc[july[1:]]
)
july_real_counts.head()
Out[4]:
4 7 12 13 24 25 33 40 41 42 ... 244 246 249 255 256 260 261 262 263 264
2018-07-01 00:00:00 61.0 29.0 9.0 50.0 23.0 26.0 12.0 6.0 42.0 29.0 ... 16.0 159.0 546.0 56.0 41.0 10.0 41.0 28.0 154.0 131.0
2018-07-01 01:00:00 60.0 24.0 3.0 31.0 15.0 9.0 14.0 6.0 43.0 27.0 ... 11.0 172.0 498.0 53.0 46.0 8.0 31.0 11.0 129.0 100.0
2018-07-01 02:00:00 29.0 25.0 0.0 17.0 6.0 3.0 4.0 1.0 37.0 21.0 ... 12.0 207.0 361.0 53.0 38.0 9.0 19.0 12.0 100.0 93.0
2018-07-01 03:00:00 24.0 46.0 0.0 6.0 7.0 8.0 2.0 4.0 23.0 24.0 ... 6.0 198.0 251.0 42.0 29.0 8.0 10.0 5.0 55.0 54.0
2018-07-01 04:00:00 16.0 60.0 0.0 11.0 11.0 8.0 2.0 2.0 29.0 19.0 ... 4.0 126.0 138.0 36.0 29.0 21.0 8.0 12.0 72.0 33.0

5 rows × 83 columns

In [5]:
# July forecasts in long format: one row per (pickup_datetime, zone, shift).
# Zone ids are cast to str so they compare equal to the string column labels
# of july_real_counts used below.
temp_july_forecasts = (
    pd.read_csv('XGBoost/july_forecasts.csv', index_col=[0], parse_dates=[0])
    .assign(zone=lambda frame: frame['zone'].astype(str))
)
temp_july_forecasts.sample(5)
Out[5]:
real_counts prediction zone shift
pickup_datetime
2018-07-04 05:00:00 16.0 19.761364 7 2
2018-07-04 23:00:00 21.0 15.092179 137 5
2018-07-12 05:00:00 52.0 64.596780 148 5
2018-07-06 21:00:00 2.0 5.762766 223 1
2018-07-25 10:00:00 395.0 380.812770 186 6

Приведем прогнозы к более удобному формату для визуализации: разобьем по зонам и усредним прогнозы по моделям.

In [6]:
# Zone ids to process: the column labels of the observed-counts frame
# (strings such as '4'; the plotting cells below index with them directly).
zones = july_real_counts.columns
In [7]:
# Reshape the long-format forecasts into one averaged hourly series per zone.
#
# For every zone and every shift 1..6 the predictions are aligned on the hourly
# `july` index and then moved `shift` rows (hours) later; the six shifted columns
# are averaged row-wise (NaNs are skipped by `mean`).
# NOTE(review): presumably `shift` is the forecast horizon in hours, so the move
# places each prediction on the hour it was made for - confirm against the model code.
zone_means = {}
for zone in zones:
    # Select the zone's rows once instead of re-scanning the whole frame per shift.
    zone_rows = temp_july_forecasts[temp_july_forecasts['zone'] == zone]
    steps = pd.DataFrame(index=july)
    for shift in range(1, 7):
        prediction = zone_rows.loc[zone_rows['shift'] == shift, ['prediction']]
        column = 'step_{}'.format(shift)
        # Column assignment aligns on the hourly index first; shifting must happen
        # after that alignment so that one row always corresponds to one hour.
        steps[column] = prediction
        steps[column] = steps[column].shift(shift)
    zone_means[zone] = steps.mean(axis=1)

# Assemble all zones at once, name the axes and drop hours where any zone has
# no forecast (e.g. the leading row, which no shifted prediction can reach).
july_forecasts = (
    pd.DataFrame(zone_means, index=july)
    .rename_axis(index='pickup_datetime', columns='zone_id')
    .dropna()
)
july_forecasts.head()
Out[7]:
zone_id 4 7 12 13 24 25 33 40 41 42 ... 244 246 249 255 256 260 261 262 263 264
pickup_datetime
2018-07-01 00:00:00 59.892880 28.222006 6.981757 50.526620 23.650210 24.622015 11.987604 6.416379 42.445393 28.746181 ... 15.826271 159.395070 531.203060 62.719270 40.277400 9.939765 40.760353 29.404203 154.340700 131.305000
2018-07-01 01:00:00 70.649700 28.849571 3.012228 28.013958 18.972921 19.190006 10.734288 6.143519 41.954241 25.986612 ... 11.811124 190.910880 491.308100 65.595657 47.626486 8.387832 34.410505 16.872319 125.134350 106.782322
2018-07-01 02:00:00 54.035273 31.702317 1.550948 13.252006 14.913000 6.998274 7.010793 4.091743 37.571760 16.944605 ... 7.492383 212.060797 412.540343 56.803001 41.135819 11.730675 23.636683 9.092324 101.887935 86.326935
2018-07-01 03:00:00 26.519257 31.065535 0.213520 6.419751 7.353878 7.525317 2.948047 2.097734 28.281779 15.612941 ... 7.417816 216.090690 245.988330 42.896397 41.156631 9.134269 16.635297 8.608337 72.314247 72.822759
2018-07-01 04:00:00 17.245584 55.956693 0.312895 6.939186 8.469899 7.356421 2.957109 2.081662 20.180937 17.471050 ... 5.873148 137.849984 154.254624 37.427597 26.831558 16.065526 10.728466 14.514404 54.377585 38.370271

5 rows × 83 columns

Получили привычные временные ряды по каждой зоне.

In [8]:
# Release the long-format forecast frame; the cells below only use july_forecasts.
# NOTE: once this has run, the forecast-loading cell must be re-executed before
# the reshaping cell can be run again.
del temp_july_forecasts

Визуализируем временные ряды

In [9]:
# Callback for the zone selector
def update_zone(zone_id):
    """Show the observed and predicted series of one zone in the time-series figure.

    Parameters
    ----------
    zone_id : str
        Column label present in both ``july_real_counts`` and ``july_forecasts``.

    Notes
    -----
    Reads the module-level ``timeseries_plot``, ``july_real_counts`` and
    ``july_forecasts``; the figure is created in a later cell, so it must exist
    by the time the callback is invoked. Only the y-values are replaced.
    """
    # Send both trace changes to the widget as one update so the figure never
    # shows the new observed series next to the previous zone's forecast.
    with timeseries_plot.batch_update():
        timeseries_plot.data[0].y = july_real_counts[zone_id]
        timeseries_plot.data[1].y = july_forecasts[zone_id]
In [10]:
# Time-series figure: observed vs. predicted hourly counts for one zone
# (zone '4' initially; the dropdown callback swaps the y-values).
#
# x is taken from each series' own index. `july` starts one hour earlier
# (2018-06-30 23:00) and is one element longer than the data; because plotly
# pairs x and y by position, using it as x would draw every point one hour early.
trace_real = go.Scatter(x=july_real_counts.index,
                        y=july_real_counts['4'],
                        name='Real counts',
                        line=dict(width=2, color='#6785be'),
                        )

trace_predicted = go.Scatter(x=july_forecasts.index,
                             y=july_forecasts['4'],
                             name='Predicted counts',
                             line=dict(width=2, dash='dot', color='#ba6657'),
                             )

data = [trace_real, trace_predicted]

# Date x-axis with a range slider underneath for zooming into a sub-period.
layout = dict(title='Real and predicted timerows of taxi zones (JULY 2018)',
              xaxis=dict(rangeslider=dict(visible=True),
                         type='date'),
              font=dict(family='Rockwell'),
              height=600,
              margin=dict(l=20, r=10, b=0, t=50, pad=0),
              )

timeseries_plot = go.FigureWidget(data=data, layout=layout)
In [11]:
# Zone selector: a dropdown over all zone ids, wired to update_zone.
# The initial value '4' matches the zone drawn when the figure was created.
zone_selector = widgets.Dropdown(
    options=zones,
    value='4',
    description='chooze zone_id',
    disabled=False,
)
choose_zone_drop_down = interactive(update_zone, zone_id=zone_selector)
In [12]:
# Display the figure with the zone selector stacked underneath it.
VBox([timeseries_plot, choose_zone_drop_down])
In [21]:
# Embed a GIF in a 1000x700 frame (presumably a recording of the widget above,
# for viewers where the live widget does not render - confirm).
# NOTE(review): `unconfined` is an option of IPython.display.Image; IFrame
# presumably forwards unknown keyword arguments as URL query parameters, so
# verify that it has the intended effect here.
IFrame('exp/real_and_predicted_timerows.gif', 1000, 700, unconfined=True)
Out[21]:

Визуализируем карту

In [13]:
# Taxi-zone polygons: keep the name, id, borough and geometry columns and
# rename them to the names used in the rest of the notebook.
# NOTE(review): OBJECTID is used as the zone id - confirm it matches the zone
# ids used as column labels in the counts/forecast data.
zones_gdf = (
    gpd.read_file('NYC_TAXI_data/other_data/taxi_zones.geojson')
    .loc[:, ['zone', 'OBJECTID', 'borough', 'geometry']]
    .rename(columns={'zone': 'zone_name', 'OBJECTID': 'zone_id'})
)
# String ids, so they can be merged with the string zone ids of the forecasts.
zones_gdf['zone_id'] = zones_gdf['zone_id'].astype(str)

# Map centre: midpoint of a latitude range and of a longitude range
# (presumably the bounding box of New York City - confirm).
NY_center_lat = (40.49612 + 40.91553) / 2
NY_center_lon = (-74.25559 - 73.70001) / 2
In [14]:
# Attach the forecasts to the zone geometry: transpose to one row per zone and
# one column per hour, then left-join on zone_id so that zones without a
# forecast stay in the frame with NaN values.
forecasts_by_zone = july_forecasts.transpose().reset_index()
zones_gdf_with_forecast = zones_gdf.merge(forecasts_by_zone, how='left', on='zone_id')
# Timestamp column labels become strings so they can be looked up via str(dt).
zones_gdf_with_forecast.columns = zones_gdf_with_forecast.columns.astype(str)
zones_gdf_with_forecast.sample(2)
Out[14]:
zone_name zone_id borough geometry 2018-07-01 00:00:00 2018-07-01 01:00:00 2018-07-01 02:00:00 2018-07-01 03:00:00 2018-07-01 04:00:00 2018-07-01 05:00:00 ... 2018-07-31 14:00:00 2018-07-31 15:00:00 2018-07-31 16:00:00 2018-07-31 17:00:00 2018-07-31 18:00:00 2018-07-31 19:00:00 2018-07-31 20:00:00 2018-07-31 21:00:00 2018-07-31 22:00:00 2018-07-31 23:00:00
127 Inwood Hill Park 128 Manhattan MULTIPOLYGON (((-73.92360 40.87890, -73.92362 ... NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
119 Highbridge Park 120 Manhattan POLYGON ((-73.92295 40.85885, -73.92283 40.856... NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 748 columns

In [15]:
# Callback that switches the hour shown on the map
def update_datetime(dt):
    """Colour the map by the forecast column whose label equals ``str(dt)``.

    Reads the module-level ``map_plot`` and ``zones_gdf_with_forecast``; the
    map figure is created in a later cell and must exist when this is called.
    """
    column_label = str(dt)
    map_plot.data[0].z = zones_gdf_with_forecast[column_label]
In [16]:
# GeoJSON for the choropleth. Only the identifying columns and the geometry are
# exported: the forecast values reach the map through the trace's `z`, so
# serialising every hourly forecast column as feature properties would only
# inflate the payload sent to the browser.
zones_geojson = json.loads(
    zones_gdf_with_forecast[['zone_id', 'zone_name', 'geometry']].to_json()
)
In [17]:
# Choropleth map of the taxi zones, coloured by the predicted trip count of the
# first July hour (july[1]); the time slider callback replaces `z` afterwards.
data = go.Choroplethmapbox(geojson=zones_geojson,
                           locations=zones_gdf_with_forecast['zone_id'],
                           # Match locations against the zone_id property. The feature
                           # `id` written by GeoDataFrame.to_json() is the positional row
                           # index, which is offset from zone_id, so the default matching
                           # would paint each value on the wrong polygon.
                           featureidkey='properties.zone_id',
                           z=zones_gdf_with_forecast[str(july[1])],
                           hovertext=zones_gdf_with_forecast['zone_name'],
                           hovertemplate='<b>Zone name</b>: <b>%{hovertext}</b>'+
                                           '<br><b>Zone ID </b>: %{location}'+
                                           "<extra></extra>",
                           showlegend=False,
                           autocolorscale=False,
                           colorscale='Viridis',
                           showscale=True,
                           marker_opacity=0.8, marker_line_width=0.1)
layout = go.Layout(mapbox_style='carto-positron',
                   mapbox_zoom=9,
                   mapbox_center={'lat': NY_center_lat, 'lon': NY_center_lon},
                   hoverlabel=dict(bgcolor="white", font_size=12, font_family='Rockwell'),
                   margin={"r": 0, "t": 100, "l": 0, "b": 0},
                   title='Most popular New York taxi zones (july 2018)<br>'+
                           '(colored by predicted trip count)',
                   height=600, width=600)

map_plot = go.FigureWidget(data=data, layout=layout)
In [18]:
# Time slider wired to update_datetime. Its options are the 24*7 hourly
# timestamps that follow july[0], i.e. the first week of July; the first
# option is july[1], the hour the map is initially drawn for.
first_week_hours = july[1:24*7+1]
hour_selector = widgets.SelectionSlider(
    options=first_week_hours,
    description=' ',
    layout=widgets.Layout(width='600px'),
    style={'description_width': 'initial'},
)
datetime_slider = interactive(update_datetime, dt=hour_selector)
In [19]:
# Display the map with the time slider stacked underneath it.
VBox([map_plot, datetime_slider])
In [22]:
# Embed a GIF in a 1000x650 frame (presumably a recording of the map widget
# above, for viewers where the live widget does not render - confirm).
# NOTE(review): `unconfined` is an option of IPython.display.Image; IFrame
# presumably forwards unknown keyword arguments as URL query parameters, so
# verify that it has the intended effect here.
IFrame('exp/map_with_predictions.gif', 1000, 650, unconfined=True)
Out[22]:
In [ ]: